Part 0: Load packages and data set

# Install and import packages
# Install skimr package for the first time -> install.packages("skimr")

library(ggplot2)
library(dplyr)
library(skimr)
library(plotly)

# Read the VOTE data set from disk and keep it in the environment.
# The location is an absolute path on the author's machine.
vote_data <- read.csv(
  file = "C:/Users/tobia/Desktop/University/University of St. Gallen/Semester 4/2. Data Analytics 2/R - Exercises/Data/VOTE.csv",
  header = TRUE
)

# Summarise every column (type, missingness, distribution) with skimr
skim(vote_data)
Data summary
Name vote_data
Number of rows 173
Number of columns 9
_______________________
Column type frequency:
character 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
state 0 1 2 2 0 44 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X 0 1 87.99 50.09 1.00 45.00 88.00 131.00 174.00 ▇▇▇▇▇
district 0 1 8.86 8.75 1.00 3.00 6.00 11.00 42.00 ▇▂▁▁▁
democA 0 1 0.55 0.50 0.00 0.00 1.00 1.00 1.00 ▆▁▁▁▇
voteA 0 1 51.24 19.80 16.00 36.00 50.00 65.00 189.00 ▇▇▁▁▁
expendA 0 1 307.21 280.70 0.30 60.67 241.45 453.46 1470.67 ▇▅▂▁▁
expendB 0 1 304.13 306.23 0.93 60.05 221.53 450.72 1548.19 ▇▃▂▁▁
prtystrA 0 1 49.67 9.96 22.00 44.00 50.00 56.00 71.00 ▁▃▇▆▃
shareA 0 1 50.80 33.60 0.09 18.54 50.82 84.26 99.50 ▇▅▅▅▇
# Drop the observation whose vote share exceeds 100%: a percentage above 100
# cannot be accurate. The row is selected by its value rather than by a fixed
# row position, so the step stays correct if the row order of the CSV changes.
vote_data <- vote_data %>%
  filter(voteA <= 100)

# Add three derived variables to the data frame:
#   expenddiff - spending gap, expendB minus expendA
#   expendA2   - expendA squared
#   expendB2   - expendB squared
vote_data <- vote_data %>%
  mutate(
    expenddiff = expendB - expendA,
    expendA2 = expendA^2,
    expendB2 = expendB^2
  )

# Show the first rows to check the new columns
head(vote_data)
##   X state district democA voteA expendA expendB prtystrA   shareA expenddiff
## 1 1    AL        7      1    68 328.296   8.737       41 97.40767 -319.55899
## 2 3    AZ        2      1    73  99.607   3.065       55 97.01476  -96.54200
## 3 4    AZ        3      0    69 319.690  26.281       64 92.40370 -293.40900
## 4 5    AR        3      0    75 159.221  60.054       66 72.61247  -99.16699
## 5 6    AR        4      1    69 570.155  21.393       46 96.38355 -548.76203
## 6 7    CA        2      0    59 696.748 193.915       58 78.22802 -502.83299
##     expendA2     expendB2
## 1 107778.257    76.335177
## 2   9921.555     9.394225
## 3 102201.698   690.690968
## 4  25351.325  3606.483019
## 5 325076.757   457.660434
## 6 485457.756 37603.024621

Here write your comment:
The data set appears to be incomplete, even though no individual values are missing (every column has n_missing = 0). For example, only 43 of the 50 states are represented, and not every district within each state is included.



Part I

Descriptive statistics. Answer the following questions.


(i) Generate two scatter plots showing the relationship between voteA on the vertical axis and expendA, or expendB, on the horizontal axis. Make sure that the plots also contain a line showing the linear relationship of the variables. Interpret their slope.

# HERE: Create your scatterdiagram
# Scatter plot of voteA (vertical) against expendA (horizontal), coloured by
# democA, with the fitted linear regression line and a dashed 50% reference.
#
# The x axis stays on its original scale: scale_x_log10() transforms the data
# before geom_smooth() fits, so the drawn line would come from
# voteA ~ log10(expendA) and would not match the slope reported by modelA.
# coord_cartesian() zooms to 0-100 without discarding observations, whereas
# ylim() would drop out-of-range rows before the line is fitted.
ggplot(vote_data, aes(expendA, voteA, color = factor(democA))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  geom_hline(yintercept = 50, color = "black", linetype = "dashed") +
  scale_color_manual(values = c("red", "blue")) +
  coord_cartesian(ylim = c(0, 100)) +
  labs(title = "Democratic campaign spending vs. % of votes",
       x = "Campaign spending for democratic party, 1000s",
       y = "Percentage of votes for democratic party",
       color = "Rep vs. Dem")

# Simple linear regression behind the plotted line; the summary gives the
# intercept, the slope and R-squared used for the interpretation.
modelA <- lm(voteA ~ expendA, data = vote_data)
summary(modelA)
## 
## Call:
## lm(formula = voteA ~ expendA, data = vote_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.751 -12.403  -4.326  15.945  32.714 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 43.177094   1.760267  24.529  < 2e-16 ***
## expendA      0.023509   0.004223   5.566 9.97e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.51 on 170 degrees of freedom
## Multiple R-squared:  0.1542, Adjusted R-squared:  0.1492 
## F-statistic: 30.98 on 1 and 170 DF,  p-value: 9.969e-08

Here: write your comment:
There is a positive linear relationship between expendA and voteA. The intercept is 43.18 and the slope is 0.02351, so each additional $1,000 of spending is associated with roughly 0.024 more percentage points of votes. The low R² means that spending explains only about 15% of the variance in voteA. The colour scheme suggests that Democratic campaign spending is focused on Republican states that contain both blue and red districts.


# HERE: Create your scatterdiagram
# Scatter plot of 100 - voteA (vertical) against expendB (horizontal),
# coloured by democA, with the fitted linear regression line and a dashed 50%
# reference.
#
# The x axis stays on its original scale: scale_x_log10() transforms the data
# before geom_smooth() fits, so the drawn line would come from
# (100 - voteA) ~ log10(expendB) and would not match the slope reported by
# modelB. coord_cartesian() zooms to 0-100 without discarding observations,
# whereas ylim() would drop out-of-range rows before the line is fitted.
ggplot(vote_data, aes(expendB, 100 - voteA, color = factor(democA))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  geom_hline(yintercept = 50, color = "black", linetype = "dashed") +
  scale_color_manual(values = c("red", "blue")) +
  coord_cartesian(ylim = c(0, 100)) +
  labs(title = "Republican campaign spending vs. % of votes",
       x = "Campaign spending for republican party, 1000s",
       y = "Percentage of votes for republican party",
       color = "Rep vs. Dem")

# Simple linear regression behind the plotted line; the summary gives the
# intercept, the slope and R-squared used for the interpretation.
modelB <- lm(100 - voteA ~ expendB, data = vote_data)
summary(modelB)
## 
## Call:
## lm(formula = 100 - voteA ~ expendB, data = vote_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.243 -11.643  -4.553  13.866  37.525 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.541617   1.644411  25.870  < 2e-16 ***
## expendB      0.023060   0.003808   6.056  8.7e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.29 on 170 degrees of freedom
## Multiple R-squared:  0.1774, Adjusted R-squared:  0.1726 
## F-statistic: 36.67 on 1 and 170 DF,  p-value: 8.701e-09

Here: write your comment:
Republican expenditure (expendB) is positively correlated with the Republican vote share (100 - voteA). The intercept is 42.54 and the slope is 0.02306, with an R² of about 18%. The results do not differ notably between Democrats and Republicans. Republicans spend less in blue states/districts, focusing on districts that are already “red” and on those below 40% of the votes.


(iv) A political campaign manager would like to know by how much the election results of her own party would improve if she invested 10,000$ extra funding in her political campaign. How would you help her, and what is your answer?

# Here: write your code
# Narrow the data to the district of interest: a Nebraska (NE) observation
# with democA equal to 0 and voteA strictly between 45 and 50.
# add_expendA is expendA raised by 10; the plot labels give spending in 1000s,
# so 10 stands for the extra $10,000.
pick_district <- vote_data %>%
  filter(state == "NE", democA == 0, voteA > 45, voteA < 50) %>%
  mutate(add_expendA = expendA + 10)

pick_district
##     X state district democA voteA  expendA expendB prtystrA   shareA expenddiff
## 1 101    NE        2      0    49 1158.294 858.762       58 57.42498  -299.5319
##   expendA2 expendB2 add_expendA
## 1  1341645 737472.2    1168.294

Here: write your comment:
Filtering data reveals a potential swing district (District 2, NE) at 49% in a Republican state. A $10,000 investment may improve it, but the slope (0.023) suggests a minor increase to 49.23%. More investment recommended for significant impact on the district.

(v) The same political campaign manager is from Texas. She is concerned that your results do not account for the particular electoral situation in Texas. What would you do to advise her? What is your answer? What are potential limitations of your answer?

# Here: write your code
# Aggregate the district-level observations to one row per state.
#   count          - number of districts observed in the state
#   avg_state_vote - mean of voteA across those districts
#   state_shareA   - mean of shareA across those districts
#   state_expendA  - total expendA across those districts
#   state_expendB  - total expendB across those districts
#   state_colour   - "blue" when avg_state_vote is at least 50, else "red"
state_data <- vote_data %>%
  group_by(state) %>%
  summarize(
    count = n(),
    avg_state_vote = mean(voteA),
    state_shareA = mean(shareA),
    state_expendA = sum(expendA),
    state_expendB = sum(expendB)
  ) %>%
  mutate(state_colour = if_else(avg_state_vote >= 50, "blue", "red"))

# Bar chart of the averaged vote per state with a dashed 50% reference line.
# The bar colour is mapped from the state_colour column of the plotted data
# (scale_fill_identity() uses the stored colour names as-is), so each colour
# stays attached to its own state however the rows are ordered.
plt <- ggplot(state_data, aes(state, avg_state_vote, fill = state_colour)) +
  geom_col() +
  scale_fill_identity() +
  geom_hline(yintercept = 50, color = "black", linetype = "dashed") +
  theme_light() +
  labs(title = "Blue vs. Red States",
       y = "State vote averaged",
       x = "States") +
  ylim(0, 100)

# Render the chart as an interactive plotly widget
ggplotly(plt)
# Another graph that shows the individual additional 10k investment.
# Keep only the Texas observations.
Texas <- vote_data[vote_data$state == "TX", ]

# Extra spending of 10; the plot labels give spending in 1000s.
texas10k <- 10

# The district that receives the extra spending. It is looked up by its
# district number, so the modified row and the plot title always refer to the
# same district regardless of row order.
target_district <- 13
target_row <- which(Texas$district == target_district)
stopifnot(length(target_row) == 1L)

# Raise expendA for the target district. As before, Texas itself carries the
# raised value and Texas_added10k starts as a copy of it.
Texas$expendA[target_row] <- Texas$expendA[target_row] + texas10k
Texas_added10k <- Texas

# Recompute shareA from the updated spending. The formula
# 100 * expendA / (expendA + expendB) reproduces the shareA values printed by
# head(vote_data).
Texas_added10k$shareA[target_row] <- with(
  Texas_added10k[target_row, ],
  100 * expendA / (expendA + expendB)
)

# Plot voteA against expendA for the Texas districts, labelled by district
# number, with a fitted line and a dotted 50% reference line.
ggplot(Texas_added10k, aes(expendA, voteA, color = factor(democA))) +
  geom_point() +
  geom_text(aes(label = district), hjust = 0, vjust = -1) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_x_log10() +
  labs(title = paste0("Effect on district ", target_district,
                      " after 10K additional investment"),
       x = "Campaign spending for democratic party, 1000s",
       y = "Percentage of votes for democratic party",
       color = "Rep vs. Dem") +
  scale_color_manual(values = c("red", "blue")) +
  geom_hline(yintercept = 50, color = "black", linetype = "dotted") +
  ylim(0, 100)

Here: write your comment:
There are only 43 states in the dataset, although there should be 50, and not all districts within a state are listed. Our findings are therefore limited by the available data, and we would have to be cautious in advising any extra spending, as Texas is an important state in the elections.